 # -*- coding: utf-8 -*-
############# necessary files

import zhon.hanzi
import string
import codecs
import re
import numpy as np
import itertools
import os

############## parsing part
# Single characters treated as noise: ASCII punctuation, ASCII digits and the
# punctuation string provided by zhon.hanzi, exploded into a list of characters.
pp = list(string.punctuation + string.digits + zhon.hanzi.punctuation)

## uncomment the next two lines if importing fails
# import sys
# sys.path.insert(0,  '../')
# Resolve the stopword file relative to this module rather than the current
# working directory.
dirname = os.path.dirname(__file__)
filename = os.path.join(dirname, '../supporting/stopwords1.txt')

# Only the first line of the file is used; it holds the comma-separated
# stopwords. The context manager guarantees the handle is closed once that
# line has been read. An empty file still raises IndexError here.
with codecs.open(filename, 'r', encoding='utf-8') as stopword_file:
    stopwords = stopword_file.readlines()[0]

# Split on commas and drop duplicates (resulting order is not defined).
stopwords = list(set(stopwords.strip().split(",")))

# Map every noise character and stopword to 0.
# NOTE(review): presumably a dict is used only for fast membership tests;
# the value 0 is not read anywhere in the visible code.
stop = pp + stopwords
stop = dict.fromkeys(stop, 0)


# Part-of-speech tag -> Chinese name of the category. The English gloss of
# each value is given in the trailing comment.
# NOTE(review): presumably these are the tags kept after POS tagging; confirm
# against the code that reads `acceptlist`.
acceptlist = {
    "n": "名词",        # noun
    "np": "人名",       # person name
    "ns": "地名",       # place name
    "ni": "机构名",     # organisation name
    "nz": "其它专名",   # other proper noun
    "m": "数词",        # numeral
    "q": "量词",        # measure word
    "mq": "数量词",     # numeral + measure word
    "t": "时间词",      # time word
    "f": "方位词",      # direction / locality word
    "s": "处所词",      # place word
    "v": "动词",        # verb
    "vm": "能愿动词",   # modal verb
    "vd": "趋向动词",   # directional verb
    "a": "形容词",      # adjective
    "d": "副词",        # adverb
    "i": "习语",        # idiom
    "j": "简称",        # abbreviation
    "g": "语素",        # morpheme
    "w": "标点",        # punctuation
    "id": '长',         # literally "long"; meaning of this tag is unclear
}

# Matches every character outside the CJK range U+4E00..U+9FA5, so
# substituting matches with "" keeps Chinese characters only.
# A wider alternative that also keeps ASCII letters, digits and underscore
# would be the character class [^\u4E00-\u9FA5A-Za-z0-9_].
filtrate = re.compile(u'[^\u4E00-\u9FA5]')

###########


def plot_precision_recall(true_label, prob, prop_positive, label):
	"""Plot a precision-recall curve and print summary AUC figures.

	Precision and recall are derived from the cumulative false/true positive
	counts returned by ``_binary_clf_curve``. The curve is drawn together
	with a horizontal reference line at ``prop_positive``, three numbers are
	printed (raw area under the precision-recall curve, that area divided by
	a normalising rectangle, and the ROC AUC), and the figure is shown.

	NOTE(review): ``plt``, ``metrics``, ``roc_auc_score`` and
	``_binary_clf_curve`` are not imported in the visible part of this
	module (presumably matplotlib.pyplot and scikit-learn) -- confirm they
	are in scope before calling.

	Args:
		true_label (list): a list of true labels
		prob (list): a list of predicted probabilities
		prop_positive (float): the proportion of positives in the test data;
			drawn as a horizontal reference line
		label (str): the caption of the figure; also used as the legend
			entry of the reference line

	Returns:
		None. The function only prints and plots.
	"""
	fps, tps, _ = _binary_clf_curve(true_label, prob)

	# Coerce the counts to float so the ratios below are true divisions even
	# if the counts arrive as integer arrays under Python 2 (this module has
	# no ``from __future__ import division``).
	fps = np.asarray(fps, dtype=float)
	tps = np.asarray(tps, dtype=float)

	precision = tps / (tps + fps)
	# The last cumulative true-positive count is used as the total.
	recall = tps / tps[-1]

	## first, plot precision/recall curve
	plt.plot(recall, precision, 'b-', label = "SVM")
	plt.xlabel("recall")
	plt.ylabel("precision")

	auc = metrics.auc(recall, precision)
	# Rectangle used to normalise the raw area.
	# NOTE(review): the corner is taken at index 1 rather than index 0 --
	# confirm this is intended. A zero-area rectangle makes the normalised
	# value inf/nan.
	nor_area = abs((precision[-1] - precision[1]) * (recall[-1] - recall[1]))

	# Single pre-formatted strings keep the output identical on Python 2 and
	# Python 3 (the spacing matches the former ``print a, b`` statements).
	print("auc for precision/recall raw   %s" % (auc,))
	print("auc for precision/recall  %s" % (auc / nor_area,))
	print("auc for ROC  %s" % (roc_auc_score(true_label, prob),))

	plt.axhline(y= prop_positive, label = label)
	plt.legend(bbox_to_anchor=(0.1, 0.3), loc=2, borderaxespad=0.)
	plt.title(label)
	plt.show()

	# A second figure, precision against the number of predicted positives
	# (tps + fps), was sketched here but is not produced.



